-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[SelectionDAG] Fold undemanded operand to UNDEF for VECTOR_SHUFFLE #145524
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Always let SimplifyDemandedVectorElts fold either side of a VECTOR_SHUFFLE to UNDEF if no elements are demanded from that side. For a single use this could be done by SimplifyDemandedVectorElts already, but in case the operand had multiple uses we did not eliminate the use.
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-backend-x86 Author: Björn Pettersson (bjope) Changes: Always let SimplifyDemandedVectorElts fold either side of a VECTOR_SHUFFLE to UNDEF if no elements are demanded from that side. For a single use this could be done by SimplifyDemandedVectorElts already, but in case the operand had multiple uses we did not eliminate the use. Full diff: https://github.com/llvm/llvm-project/pull/145524.diff 6 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 66717135c9adf..e40a592ecb57c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -3587,6 +3587,19 @@ bool TargetLowering::SimplifyDemandedVectorElts(
DemandedRHS.setBit(M - NumElts);
}
+ // If either side isn't demanded, replace it by UNDEF. We handle this
+ // explicitly here to also simplify in case of multiple uses (on the
+ // contrary to the SimplifyDemandedVectorElts calls below).
+ bool FoldLHS = !DemandedLHS && !LHS.isUndef();
+ bool FoldRHS = !DemandedRHS && !RHS.isUndef();
+ if (FoldLHS || FoldRHS) {
+ LHS = FoldLHS ? TLO.DAG.getUNDEF(LHS.getValueType()) : LHS;
+ RHS = FoldRHS ? TLO.DAG.getUNDEF(RHS.getValueType()) : RHS;
+ SDValue NewOp =
+ TLO.DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, ShuffleMask);
+ return TLO.CombineTo(Op, NewOp);
+ }
+
// See if we can simplify either shuffle operand.
APInt UndefLHS, ZeroLHS;
APInt UndefRHS, ZeroRHS;
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll
index 008e19b620520..5914253b5f58e 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll
@@ -1228,51 +1228,49 @@ define void @v_shuffle_v3bf16_v2bf16__3_u_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_u_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v1
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v2
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, 0
; GFX900-NEXT: v_alignbit_b32 v2, s4, v2, 16
-; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v0, v2, s[16:17]
-; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_u_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v1
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: v_alignbit_b32 v2, s4, v2, 16
-; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v0, v2, s[16:17]
-; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3bf16_v2bf16__3_u_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v1
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v2
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_alignbit_b32 v2, s0, v2, 16
-; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX942-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v0, v2, s[0:1]
-; GFX942-NEXT: global_store_short v0, v1, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x bfloat> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll
index 99c9480adc410..cd4dbe93e8a11 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll
@@ -1928,48 +1928,45 @@ define void @v_shuffle_v3bf16_v3bf16__5_u_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_u_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[1:2]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v3, v2, s[16:17]
-; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_u_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v4, v3, s[16:17]
-; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3bf16_v3bf16__5_u_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v4, v3, s[0:1]
-; GFX942-NEXT: global_store_short v4, v0, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x bfloat> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll
index e34becc1065ff..99cb8a38f57c3 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll
@@ -1228,51 +1228,49 @@ define void @v_shuffle_v3f16_v2f16__3_u_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_u_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v1
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v2
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, 0
; GFX900-NEXT: v_alignbit_b32 v2, s4, v2, 16
-; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v0, v2, s[16:17]
-; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_u_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v1
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: v_alignbit_b32 v2, s4, v2, 16
-; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v0, v2, s[16:17]
-; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f16_v2f16__3_u_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v1
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v2
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_alignbit_b32 v2, s0, v2, 16
-; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX942-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v0, v2, s[0:1]
-; GFX942-NEXT: global_store_short v0, v1, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x half> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll
index 84d42c882494c..0854ff2ebfc5d 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll
@@ -1928,48 +1928,45 @@ define void @v_shuffle_v3f16_v3f16__5_u_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_u_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[1:2]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v3, v2, s[16:17]
-; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_u_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v4, v3, s[16:17]
-; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f16_v3f16__5_u_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v4, v3, s[0:1]
-; GFX942-NEXT: global_store_short v4, v0, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x half> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index 62ab5d82bfbb6..910dd1ee6c419 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -2099,21 +2099,19 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrlq $1, %xmm2
; SSE41-NEXT: por %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: pextrq $1, %xmm1, %rax
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm0
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: cvtsi2ss %rax, %xmm3
+; SSE41-NEXT: movq %xmm0, %rax
; SSE41-NEXT: xorps %xmm2, %xmm2
; SSE41-NEXT: cvtsi2ss %rax, %xmm2
-; SSE41-NEXT: movq %xmm1, %rax
-; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: cvtsi2ss %rax, %xmm1
-; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero
-; SSE41-NEXT: movaps %xmm1, %xmm2
-; SSE41-NEXT: addps %xmm1, %xmm2
-; SSE41-NEXT: xorps %xmm3, %xmm3
-; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[2,3]
-; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],zero,zero
+; SSE41-NEXT: movaps %xmm2, %xmm3
+; SSE41-NEXT: addps %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movaps %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: uitofp_4i64_to_4f32_undef:
|
@llvm/pr-subscribers-llvm-selectiondag Author: Björn Pettersson (bjope) Changes: Always let SimplifyDemandedVectorElts fold either side of a VECTOR_SHUFFLE to UNDEF if no elements are demanded from that side. For a single use this could be done by SimplifyDemandedVectorElts already, but in case the operand had multiple uses we did not eliminate the use. Full diff: https://github.com/llvm/llvm-project/pull/145524.diff 6 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 66717135c9adf..e40a592ecb57c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -3587,6 +3587,19 @@ bool TargetLowering::SimplifyDemandedVectorElts(
DemandedRHS.setBit(M - NumElts);
}
+ // If either side isn't demanded, replace it by UNDEF. We handle this
+ // explicitly here to also simplify in case of multiple uses (on the
+ // contrary to the SimplifyDemandedVectorElts calls below).
+ bool FoldLHS = !DemandedLHS && !LHS.isUndef();
+ bool FoldRHS = !DemandedRHS && !RHS.isUndef();
+ if (FoldLHS || FoldRHS) {
+ LHS = FoldLHS ? TLO.DAG.getUNDEF(LHS.getValueType()) : LHS;
+ RHS = FoldRHS ? TLO.DAG.getUNDEF(RHS.getValueType()) : RHS;
+ SDValue NewOp =
+ TLO.DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, ShuffleMask);
+ return TLO.CombineTo(Op, NewOp);
+ }
+
// See if we can simplify either shuffle operand.
APInt UndefLHS, ZeroLHS;
APInt UndefRHS, ZeroRHS;
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll
index 008e19b620520..5914253b5f58e 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll
@@ -1228,51 +1228,49 @@ define void @v_shuffle_v3bf16_v2bf16__3_u_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_u_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v1
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v2
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, 0
; GFX900-NEXT: v_alignbit_b32 v2, s4, v2, 16
-; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v0, v2, s[16:17]
-; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_u_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v1
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: v_alignbit_b32 v2, s4, v2, 16
-; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v0, v2, s[16:17]
-; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3bf16_v2bf16__3_u_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v1
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v2
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_alignbit_b32 v2, s0, v2, 16
-; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX942-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v0, v2, s[0:1]
-; GFX942-NEXT: global_store_short v0, v1, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x bfloat> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll
index 99c9480adc410..cd4dbe93e8a11 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll
@@ -1928,48 +1928,45 @@ define void @v_shuffle_v3bf16_v3bf16__5_u_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_u_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[1:2]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v3, v2, s[16:17]
-; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_u_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v4, v3, s[16:17]
-; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3bf16_v3bf16__5_u_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v4, v3, s[0:1]
-; GFX942-NEXT: global_store_short v4, v0, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x bfloat> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll
index e34becc1065ff..99cb8a38f57c3 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll
@@ -1228,51 +1228,49 @@ define void @v_shuffle_v3f16_v2f16__3_u_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_u_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v1
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v2
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, 0
; GFX900-NEXT: v_alignbit_b32 v2, s4, v2, 16
-; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v0, v2, s[16:17]
-; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_u_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v1
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: v_alignbit_b32 v2, s4, v2, 16
-; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v0, v2, s[16:17]
-; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f16_v2f16__3_u_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v1
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v2
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_alignbit_b32 v2, s0, v2, 16
-; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX942-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v0, v2, s[0:1]
-; GFX942-NEXT: global_store_short v0, v1, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x half> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll
index 84d42c882494c..0854ff2ebfc5d 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll
@@ -1928,48 +1928,45 @@ define void @v_shuffle_v3f16_v3f16__5_u_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_u_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[1:2]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v3, v2, s[16:17]
-; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_u_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v4, v3, s[16:17]
-; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f16_v3f16__5_u_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v4, v3, s[0:1]
-; GFX942-NEXT: global_store_short v4, v0, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x half> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index 62ab5d82bfbb6..910dd1ee6c419 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -2099,21 +2099,19 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrlq $1, %xmm2
; SSE41-NEXT: por %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: pextrq $1, %xmm1, %rax
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm0
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: cvtsi2ss %rax, %xmm3
+; SSE41-NEXT: movq %xmm0, %rax
; SSE41-NEXT: xorps %xmm2, %xmm2
; SSE41-NEXT: cvtsi2ss %rax, %xmm2
-; SSE41-NEXT: movq %xmm1, %rax
-; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: cvtsi2ss %rax, %xmm1
-; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero
-; SSE41-NEXT: movaps %xmm1, %xmm2
-; SSE41-NEXT: addps %xmm1, %xmm2
-; SSE41-NEXT: xorps %xmm3, %xmm3
-; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[2,3]
-; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],zero,zero
+; SSE41-NEXT: movaps %xmm2, %xmm3
+; SSE41-NEXT: addps %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movaps %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: uitofp_4i64_to_4f32_undef:
|
@@ -3587,6 +3587,19 @@ bool TargetLowering::SimplifyDemandedVectorElts( | |||
DemandedRHS.setBit(M - NumElts); | |||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(side note) we have llvm::getShuffleDemandedElts to do this for us now
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM with one trivial
@@ -3587,6 +3587,19 @@ bool TargetLowering::SimplifyDemandedVectorElts( | |||
DemandedRHS.setBit(M - NumElts); | |||
} | |||
|
|||
// If either side isn't demanded, replace it by UNDEF. We handle this | |||
// explicitly here to also simplify in case of mulitple uses (on the |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
mulitple -> multiple
Always let SimplifyDemandedVectorElts fold either side of a VECTOR_SHUFFLE to UNDEF if no elements are demanded from that side.
For a single use this could be done by SimplifyDemandedVectorElts already, but in case the operand had multiple uses we did not eliminate the use.